Multiplex assessment of protein variant abundance by massively parallel sequencing VAMP-seq - multiplex assay that uses fluorescent reporters to measure the steady-state abundance of protein variants in cultured human cells (each cellexpresses a single variant directly fused to EGFP…the stability of the variant dictates the abundance of the EGFP fusion and, accordingly, the green fluorescence signal of the cell) - used to assess PTEN and TPMT variants
#plots VAMP-seq score vs abundance_class
VAMP_abundance <- ggplot(ff, aes(x=abundance_class, y=score, fill=protein)) + geom_violin(draw_quantiles = 0.5)+ylab("VAMP-seq score")+xlab("Abundance Class")+theme(legend.title=element_blank(), panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_line(colour = "grey"))+ggtitle("VAMP-seq scores for each abundance classification")+geom_point(data=data.frame(x="wt-like", y=1, protein = "PTEN"), aes(x,y), colour="black", size=1.5, show.legend=FALSE)+annotate("text", x = "wt-like", y=1.09, label = "WT",colour= "black", size = 4) + scale_y_continuous(minor_breaks = seq(-2, 2, .25))+theme_bw()
plot(VAMP_abundance)

# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_proc_wt <- pten1_proc[!is.na(pten1_proc$position),]
pten1_proc_wt$secondary_struct <- ifelse(is.na(pten1_proc_wt$helix), "unknown",
ifelse(pten1_proc_wt$helix==1, "helix",
ifelse(pten1_proc_wt$sheet==1, "sheet",
ifelse(pten1_proc_wt$helix==0, "neither",
"unknown"))))
pten_pos <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN scores in relation to protein structure") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)+theme_bw()
pten_hydro <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=(hydro2-hydro1)))+ geom_point(size=.3, alpha = 0.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Hydrophobicity")+ggtitle("PTEN scores in relation to change in hydrophobicity") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)
#tpmt
tpmt1_proc_wt <- tpmt1_proc[!is.na(tpmt1_proc$position),]
tpmt1_proc_wt$secondary_struct <- ifelse(is.na(tpmt1_proc_wt$helix), "unknown",
ifelse(tpmt1_proc_wt$helix==1, "helix",
ifelse(tpmt1_proc_wt$sheet==1, "sheet",
ifelse(tpmt1_proc_wt$helix==0, "neither",
"unknown"))))
tpmt_pos <- ggplot(tpmt1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)+theme_bw()
tpmt_colors <- tpmt1_proc_wt
#[order(position, variant),]
tpmt_colors$fact <- rep(10, nrow(tpmt_colors))
temp <- 1
for(i in 1:(length(tpmt_colors$fact)-1)) {
if (tpmt_colors$secondary_struct[i] != tpmt_colors$secondary_struct[i+1]) {
tpmt_colors$fact[i] <- temp
temp <- temp + 1
} else {
tpmt_colors$fact[i] <- temp
}
}
tpmt_colors$fact[length(tpmt_colors$fact)] <- temp
# cc <- 0
# for(i in 1:(length(tpmt_colors$fact)-1)) {
# if (tpmt_colors$fact[i] != tpmt_colors$fact[i+1]) {
# print(cc)
# cc <- 0
# } else {
# cc <- cc + 1
# }
# }
tpmt_pos_vp <- ggplot(tpmt_colors, aes(x=position, y=score))+ geom_violin(data=tpmt_colors[c(1:2783, 2798:4000),], aes(fill=as.character(fact), colour = factor(TRUE)), draw_quantiles = c(0.5), scale = "width") +
scale_fill_manual(values=c("1" = "#A9A9A9", "2" = "#00C853", "3" = "#FF4848", "4" = "#00C853","5" = "#FF4848", "6" = "#00C853","7" = "#5757FF", "8" = "#00C853","9" = "#FF4848","10" = "#00C853","11" = "#5757FF", "12" = "#00C853","13" = "#FF4848", "14" = "#00C853", "15" = "#5757FF", "16" = "#00C853", "17" = "#5757FF", "18" = "#00C853", "19" = "#5757FF", "20" = "#00C853", "21" = "#5757FF", "22" = "#00C853", "23" = "#FF4848", "24" = "#5757FF", "25" = "#00C853", "26" = "#FF4848", "27" = "#00C853", "28" = "#5757FF", "29" = "#00C853", "30" = "#5757FF", "31" = "#00C853")) + scale_colour_manual(values = c("black")) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)
pten_hydro1 <- ggplot(pten1_proc_wt, aes(y=score, x=(hydro2-hydro1)))+ geom_point(size=0.5, alpha = 0.3) + ylab("Hydrophobicity")+xlab("VAMP-seq score")+ggtitle("PTEN scores in relation to change in hydrophobicity")
pten_aa_spread <- ggplot(pten1_proc_wt[2:288,], aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5))
pten_aa_spread1 <- ggplot(pten1_proc_wt[2:288,], aes(y=score, x=start)) + geom_point(size = 0.5)
plot(pten_pos)

plot(tpmt_pos)

#plot(tpmt_pos_vp)
#plot(pten_hydro)
#plot(pten_hydro1)
#plot(pten_aa_spread)
#plot(pten_aa_spread1)
# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_hbond <- pten1_proc[!is.na(pten1_proc$hbond_sum),]
pten1_hbond$secondary_struct <- ifelse(is.na(pten1_hbond$helix), "unknown",
ifelse(pten1_hbond$helix==1, "helix",
ifelse(pten1_hbond$sheet==1, "sheet",
ifelse(pten1_hbond$helix==0, "neither",
"unknown"))))
pten_plot_hbond <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure")
plot(pten_plot_hbond)

pten_plot_hbond1 <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding")
# was in aes, ggplot function call ---> colour=secondary_struct
#scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) + labs(colour="Secondary Structure")+
plot(pten_plot_hbond1)

#less hydrogen bonds ~ higher abundance
#TPMT
tpmt_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="position")
#head(tpmt_sum)
tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
#factor(position) is getting rid of some positions altogether on the graph
#tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=factor(position), y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=0.001, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 250, 10))
tpmt_heat <- ggplot(tpmt1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_continuous(breaks = seq(0, 245, 10), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#scale_fill_gradient2(low="#3F7CB9", mid="white", high="#B21F4E", midpoint=1)
#scale_fill_distiller(palette= "RdBu")
tpmt_dssp_schematic <- ggplot() + ggtitle("TPMT mean abundance scores") +
geom_segment(aes(x = 1, y = 0, xend = max(tpmt1_data$position)), yend = 0, size = 1, color = "grey70") +
geom_point(data = tpmt1_data, aes(x = position, y = 0), color = "black", size = 1.8) +
geom_point(data = subset(tpmt1_data, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
geom_point(data = subset(tpmt1_data, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) +
scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())
#plots
plot(tpmt_pos_mean)

plot(tpmt_heat)

tpmt_dssp_schematic

#+ geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)
#grouping all variants in the same secondary structure together
tpmt_aa_sum <- summarySE(tpmt_colors, measurevar="score", groupvars="fact")
tpmt_aa_mean <- ggplot(tpmt_aa_sum, aes(x=fact, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 245, 10)) + xlab("each bar is a different secondary structure")
plot(tpmt_aa_mean)

#PTEN
pten_sum <- summarySE(pten1_proc_wt, measurevar="score", groupvars="position")
NaNs produced
#head(pten_sum)
pten_pos_mean <- ggplot(pten_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)
pten_heat <- ggplot(pten1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.23, 0.42, 1, 1.2, 1.47)))+ scale_x_continuous(breaks = seq(0, 403, 20), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#c(-0.7, 0.2, 1, 1.3, 2.03)
#c(-0.23, 0.42, 1, 1.2, 1.47)
pten_extra <- read.table(file = '~/leklab/leklab/PTEN_positional_data.tsv', sep = '\t', header = TRUE)
pten_dssp_schematic <- ggplot() + ggtitle("PTEN mean abundance scores") +
geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())
#plots
plot(pten_pos_mean)

plot(pten_heat)

pten_dssp_schematic

#method1 (basic, for visualizing in rstudio)
grid.newpage()
grid.draw(rbind(ggplotGrob(tpmt_dssp_schematic), ggplotGrob(tpmt_pos_mean), ggplotGrob(tpmt_heat), size = "last"))
Removed 1 rows containing missing values (geom_segment).Removed 1 rows containing missing values (geom_point).

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_dssp_schematic), ggplotGrob(pten_pos_mean), ggplotGrob(pten_heat), size = "last"))
Removed 12 rows containing missing values (geom_errorbar).

#method2 (use for final layout, size specification, download)
gA=ggplot_gtable(ggplot_build(tpmt_pos_mean))
gB=ggplot_gtable(ggplot_build(tpmt_heat))
gC=ggplot_gtable(ggplot_build(tpmt_dssp_schematic))
Removed 1 rows containing missing values (geom_segment).Removed 1 rows containing missing values (geom_point).
ga=ggplot_gtable(ggplot_build(pten_pos_mean))
Removed 12 rows containing missing values (geom_errorbar).
gb=ggplot_gtable(ggplot_build(pten_heat))
gc=ggplot_gtable(ggplot_build(pten_dssp_schematic))
maxWidth = grid::unit.pmax(gA$widths, gB$widths, gC$widths, ga$widths, gb$widths, gc$widths)
gA$widths <- as.list(maxWidth)
gB$widths <- as.list(maxWidth)
gC$widths <- as.list(maxWidth)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)
grid.newpage()
#storing, with specified widths!!
pdf('pten_tpmt_mean_heat.pdf', width=8, height=6)
grid.arrange(arrangeGrob(gC,gA,gB,nrow=3,heights=c(.1,.3,.8)))
grid.arrange(arrangeGrob(gc,ga,gb,nrow=3,heights=c(.1,.3,.8)))
dev.off()
quartz_off_screen
2

#plotting mean score vs variant changed to
tpmt_end_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="end")
tpmt_end_mean <- ggplot(tpmt_end_sum, aes(x=end, y=score)) +
geom_bar(position=position_dodge(), stat="identity", colour="#999999") +
geom_errorbar(aes(ymin=score-sd, ymax=score+sd), width=0.001, position=position_dodge()) +
ylab("mean abundance") + xlab("variant amino acid")
plot(tpmt_end_mean)

set.seed(153)
jitter <- position_jitter(width = 1, height = NULL)
jitter1 <-position_jitter(width = 0.08, height = NULL)
jitter2 <- position_jitter(width=0.13, height = NULL)
twenty_color = c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black')
#pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")
pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_k_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_g_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_g_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_g_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral")
pten_c_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "C")) + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")
#experiment_orig
#pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter) + scale_color_manual(values=c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black'))
#, scale = "count"
#+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.5)
pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_c_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_c_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_distiller(palette = "Spectral")
#in in geom_violin(aes()) -> colour = hydro1
#pten_s_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "S")) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
#pten_s_spread1_old <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
pten_s_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_s_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_s_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral")
#graphing abundance vs change in hydrophobicity
pten_s_hh_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(hydro2-hydro1), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(hydro2-hydro1)), scale = "width") + xlab("Change in hydrophobicity") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(hydro2-hydro1), y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_voldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = voldiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_poldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = polaritydiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_weightdiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = weightdiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#specific amino acid tests
##plot(pten_k_spread)
##plot(pten_g_spread)
##plot(pten_c_spread)
##plot(pten_s_spread)
##plot(pten_s_spread1_old)
#plot(pten_s_hh_hydrodiff) #probably not very useful... does not take into account position anymore
##plot(pten_s_aa_hydrodiff)
##plot(pten_s_aa_voldiff)
##plot(pten_s_aa_poldiff)
##plot(pten_s_aa_weightdiff)
plot(pten_c_spread1)

plot(pten_c_aa)

plot(pten_c_hydrodiff)

plot(pten_s_spread1)

plot(pten_s_aa)

plot(pten_s_hydrodiff)

plot(pten_g_spread1)

plot(pten_g_aa)

plot(pten_g_hydrodiff)

plot(pten_k_spread1)

plot(pten_k_aa)

pten_a_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_a_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_r_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_r_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_n_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_n_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_d_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_d_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
plot(pten_a_spread1)

plot(pten_a_aa)

plot(pten_r_spread1)

plot(pten_r_aa)

plot(pten_n_spread1)

plot(pten_n_aa)

plot(pten_d_spread1)

plot(pten_d_aa)

VAMP-seq abundance score density plots for PTEN (top) and TPMT (bottom) nonsense variants (blue dashed line), synonymous variants (red dashed line) and missense variants (filled, solid line). The missense variant densities are colored as gradients between the lowest 10% of abundance scores (blue), the WT abundance score (white) and abundance scores above WT (red).
#Identifying items in tail to investigate
pten1_nonsense <- subset(pten1_proc, class == "nonsense")
tpmt1_nonsense <- subset(tpmt1_proc, class == "nonsense")
pten1_synon <- subset(pten1_proc, class == "synonymous")
tpmt1_synon <- subset(tpmt1_proc, class == "synonymous")
pten1_no_missense <- subset(pten1_proc, class == "synonymous" | class == "nonsense")
ggplot(pten1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white")

#+ geom_density()
ggplot(pten1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white")

ggplot(pten1_proc_wt, aes(x=score)) + geom_histogram(data=subset(pten1_proc_wt,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "missense"), fill = "green", alpha = 0.2, binwidth=.01)

ggplot(pten1_no_missense, aes(x=score)) + geom_histogram(data=subset(pten1_no_missense,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_no_missense,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01)

ggplot(tpmt1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white")

ggplot(tpmt1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white")

#0.55
nonsense_tail <- subset(pten1_nonsense, score > 0.6)
synon_tail <- subset(pten1_synon, score < 0.6)
nonsense_tail$secondary_struct <- ifelse(is.na(nonsense_tail$helix), "unknown",
ifelse(nonsense_tail$helix==1, "helix",
ifelse(nonsense_tail$sheet==1, "sheet",
ifelse(nonsense_tail$helix==0, "neither",
"unknown"))))
synon_tail$secondary_struct <- ifelse(is.na(synon_tail$helix), "unknown",
ifelse(synon_tail$helix==1, "helix",
ifelse(synon_tail$sheet==1, "sheet",
ifelse(synon_tail$helix==0, "neither",
"unknown"))))
#data[row,column]
n_tail <- nonsense_tail[,c(1,2,7,30,127)]
s_tail <- synon_tail[,c(1,2,7,30,127)]
n_tail$bp_pos <- (n_tail$position-1)*3
s_tail$bp_pos <- (s_tail$position-1)*3
n_tail
s_tail
#just in case there is a discernible pattern
s_tail_pos <- ggplot(s_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN synonymous variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)
plot(s_tail_pos)

#help visualizing NMD rules
n_tail_pos <- ggplot(n_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN nonsense variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)
plot(n_tail_pos)

#reversing data to fit tpmt1_data
rever <- function(df=tpmt_ruddle_data){df<-df[dim(df)[1]:1,]}
tpmt_ruddle_data_rev = rever(tpmt_ruddle_data)
#creating variant column, equiv to tpmt1_data's
tpmt_ruddle_data_rev$variant <- do.call(paste, c(tpmt_ruddle_data_rev[c(5,24,6)], sep=""))
#making both tables smaller
tpmt_essential <- tpmt_ruddle_data_rev[,c(2,3,4,5,6,17,19,24,27,28,29,30,31,32,33,34,35,76,77,78,137)]
tpmt1_proc_ess <- tpmt1_proc_wt[,c(1,2,3,5,6,7,30,32,80)]
#merging tables with variant name
tpmt_merge <- merge(tpmt1_proc_ess, tpmt_essential, by="variant")
#comparing abundance scores with various scores in dbNSFP (contains annotations of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome)
tpmt_cor1 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT score")+ggtitle("1")
tpmt_cor1.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_converted_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT converted rankscore")+ggtitle("1.5")
tpmt_cor5 <- ggplot(tpmt_merge, aes(x=score, y=CADD_raw_rankscore))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("CADD raw rankscore")+ggtitle("5")
tpmt_cor2 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV score")+ggtitle("2")
tpmt_cor3 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR score")+ggtitle("3")
tpmt_cor2.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV rankscore")+ggtitle("2.5")
tpmt_cor3.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR rankscore")+ggtitle("3.5")
#CADD_phred not worth
#plot(tpmt_cor5)
#plot(tpmt_cor1)
#plot(tpmt_cor1.5)
plot(tpmt_cor2)

plot(tpmt_cor3)

plot(tpmt_cor2.5)

plot(tpmt_cor3.5)

TPMT_abun_CADD <- ggplot(tpmt_merge, aes(x=abundance_class, y=CADD_raw_rankscore)) + geom_violin(draw_quantiles = c( 0.5))+ylab("CADD raw rankscore")+xlab("Abundance Class")
plot(TPMT_abun_CADD)

TPMT_abun_SIFT_conv <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(SIFT_converted_rankscore))) + geom_violin(draw_quantiles = c(0.5))+ylab("SIFT conv rankscore")+xlab("Abundance Class")
plot(TPMT_abun_SIFT_conv)

TPMT_abun_POLY <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HDIV_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HDIV rankscore")+xlab("Abundance Class")
plot(TPMT_abun_POLY)

TPMT_abun_POLY1 <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HVAR_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HVAR rankscore")+xlab("Abundance Class")
plot(TPMT_abun_POLY1)

Pred_abun_SIFT <- ggplot(tpmt_merge, aes(abundance_class)) + geom_bar(aes(fill = SIFT_pred)) + ggtitle("Abundance class vs SIFT prediction of Damaging or Tolerated")
plot(Pred_abun_SIFT)

trial_sep <- tpmt_merge[c(21,23,24,26)]
tpmt_merge_expand <- separate_rows(tpmt_merge, c("Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score", "Polyphen2_HVAR_pred"))
Pred_abun_HVAR <- ggplot(tpmt_merge_expand, aes(abundance_class)) + geom_bar(aes(fill = Polyphen2_HVAR_pred)) + ggtitle("Abundance class vs Polyphen2 HVAR predictions") + labs(subtitle = "D: Probably Damaging, P: Possibly Damaging, B: Benign")
plot(Pred_abun_HVAR)

#creation of b-score text files for pymol use
pten_pymol <- summarySE(pten1_data, measurevar="score", groupvars="position", na.rm=TRUE)
NaNs produced
#score[404] is wt
write.table(pten_pymol$score[1:403], "pten_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")
tpmt_pymol <- summarySE(tpmt1_data, measurevar="score", groupvars="position", na.rm=TRUE)
NaNs produced
#score[404] is wt
write.table(tpmt_pymol$score[1:245], "tpmt_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")
---
title: "PTEN R Notebook"
output: html_notebook
---
```{r global_options, include=FALSE}
library(knitr)
library(ggplot2)
require(gridExtra)
library(reshape2)
library(pracma)
library(ggbeeswarm)
library(Rmisc)
library(grid)
library(EBImage)
library(googlesheets)
library(tidyr)
library(dplyr)
knitr::opts_chunk$set(fig.width=12, fig.height=12, warning=FALSE)
```
Multiplex assessment of protein variant abundance by massively parallel sequencing
VAMP-seq
- multiplex assay that uses fluorescent reporters to measure the steady-state abundance of protein variants in cultured human cells (each cellexpresses a single variant directly fused to EGFP...the stability of the variant dictates the abundance of the EGFP fusion and, accordingly, the green fluorescence signal of the cell)
- used to assess PTEN and TPMT variants 
```{r include=FALSE}
par(pch=20, cex=.6)
pten1_data <- read.delim('~/leklab/leklab/pten1.txt')
pten1_proc <- pten1_data[!is.na(pten1_data$abundance_class),]
dd <- data.frame(pten1_proc$abundance_class,pten1_proc$score)
colnames(dd) <- c("abundance_class", "score")
tpmt1_data <- read.delim('~/leklab/leklab/tpmt_suppl_2.txt')
tpmt1_proc <- tpmt1_data[!is.na(tpmt1_data$abundance_class),]
ee <- data.frame(tpmt1_proc$abundance_class,tpmt1_proc$score)
colnames(ee) <- c("abundance_class", "score")
dd$protein <- rep("PTEN", nrow(dd))
ee$protein <- rep("TPMT", nrow(ee))
ff = data.frame(rbind(dd, ee))
bbpp = boxplot(score~protein+abundance_class, data = ff, at = c(1, 1.8, 3, 3.8, 5, 5.8, 7.2, 8), xaxt='n', col = c('white', 'gray'))
axis(side=1, at=c(1.4, 3.4, 5.4, 7.6), labels=c('low', 'possibly low', 'possibly\n wt-like', 'wt-like'))
title('VAMP-seq scores of PTEN and TPMT Variants\nand abundance class')


#plot(x = pten1_proc$abundance_class, y = pten1_proc$score,type='p', main = "PTEN", xlab = "Abundance", ylab = "VAMP-seq score", col="#74ABD6")
#points(x = tpmt1_proc$abundance_class, y = tpmt1_proc$score, type='p', col = "#ADDFAD")
```
```{r include=FALSE}

# d <- read.table(text = "col_a col_b 
#                         aa    1
#                         ba    1.25
#                         ba    1
#                         ba    1.25
#                         ca    1.3
#                         ca    1.25
#                         da    1.5
#                         da    1.25
#                         aa    1.7
#                         ca    1.25
#                         ba    1.2
#                         da    1.25
#                         aa    1.4
#                         aa    1.25
#                         ca    1.1
#                         aa    1.25", 
#                 header = TRUE,)
# e <- read.table(text = "col_a col_b 
#                         aa    1.6
#                         aa    1.55
#                         ba    1.2
#                         ba    1.45
#                         ca    1.8
#                         ca    1.55
#                         da    1.5
#                         da    1.35
#                         aa    1.9
#                         ca    1.75
#                         ba    1.25
#                         da    1.55
#                         aa    1.45
#                         aa    1.5
#                         ca    1.3
#                         aa    1.75", 
#                 header = TRUE,)
# d$label <- rep(1, nrow(d))
# e$label <- rep(2, nrow(e))
# f = data.frame(rbind(d, e))
# ##f$col_a = pollutant
# ##f$label = location
# bp = boxplot(col_b~label+col_a, data = f, at = c(1, 1.8, 3, 3.8, 5, 5.8, 7.2, 8), xaxt='n', ylim = c(.9, 1.9), col = c('white', 'gray'))
# axis(side=1, at=c(1.4, 3.4, 5.4, 7.6), labels=c('aa', 'ba', 'ca', 'da'), title('practice'))
```
```{r}
#plots VAMP-seq score vs abundance_class

VAMP_abundance <- ggplot(ff, aes(x=abundance_class, y=score, fill=protein)) + geom_violin(draw_quantiles = 0.5)+ylab("VAMP-seq score")+xlab("Abundance Class")+theme(legend.title=element_blank(), panel.grid.major = element_line(colour = "grey"), panel.grid.minor = element_line(colour = "grey"))+ggtitle("VAMP-seq scores for each abundance classification")+geom_point(data=data.frame(x="wt-like", y=1, protein = "PTEN"), aes(x,y), colour="black", size=1.5, show.legend=FALSE)+annotate("text", x = "wt-like", y=1.09, label = "WT",colour= "black", size = 4) + scale_y_continuous(minor_breaks = seq(-2, 2, .25))+theme_bw()
plot(VAMP_abundance)
```
```{r include=FALSE}
#plots helix vs score for PTEN
#ggplot(pten1_data, aes(x=as.factor(helix), y=score)) + geom_boxplot()+ylab("VAMP-seq score")
```
```{r include=FALSE}
#combining pten1_data and tpmt1_data into one large data frame, differentiate between the two w/ column 'protein' which specifies 'PTEN' or 'TPMT'
pten1_data$protein <- rep("PTEN", nrow(pten1_data))
tpmt1_data$protein <- rep("TPMT", nrow(tpmt1_data))
common_cols <- intersect(colnames(pten1_data), colnames(tpmt1_data))
comb_data = rbind(subset(pten1_data, select = common_cols), subset(tpmt1_data, select = common_cols))

#plots helix vs score for PTEN and TPMT side by side
#no NA

comb_data_helix <- comb_data[!is.na(comb_data$helix),]
#check to see where 3759 rows went off to
ck <- comb_data_helix[!is.na(comb_data_helix$abundance_class),]
comb_data_sheet <- comb_data[!is.na(comb_data$sheet),]
ck1 <- comb_data_sheet[!is.na(comb_data_sheet$abundance_class),]

h_plot <- ggplot(ck, aes(x=as.factor(helix), y=score, fill=protein)) + geom_violin(data=subset(ck, helix==1), draw_quantiles = c(0.5)) + guides(fill=FALSE) + xlab("Alpha Helix") + ylab("VAMP-seq score") + theme(axis.text.x = element_blank()) + scale_y_continuous(limits = c(-.7, 2.03))

s_plot <- ggplot(ck1, aes(x=as.factor(sheet), y=score, fill=protein)) + geom_violin(data=subset(ck1, sheet==1), draw_quantiles = c(0.5)) +  theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.text.x = element_blank()) + xlab("Beta Sheet") + scale_y_continuous(limits = c(-.7, 2.03)) + guides(fill=FALSE) 

n_plot <- ggplot(ck, aes(x=as.factor(helix), y=score, fill=protein)) + geom_violin(data=subset(ck, helix==0 & sheet==0), draw_quantiles = c( 0.5)) + theme( axis.title.y = element_blank(), axis.text.y = element_blank(), axis.text.x = element_blank(), legend.justification=c(1,0), legend.position=c(.49,.75), legend.title=element_blank(), legend.text = element_text(size=10)) + xlab("Other") + scale_y_continuous(limits = c(-.7, 2.03))

#put the plots side by side
combined <- grid.arrange(h_plot, s_plot, n_plot, ncol=3, top = "Variant scores in relation to position in protein")
##############
##save as pdf

# pdf("violin_Variant_scores_vs.pdf")
# plot(combined)
# plot(VAMP_abundance)
# dev.off()
##############
#works to save single
#ggsave("Variant_scores_protein_position.pdf", plot = combined, device = "pdf", path = "/Users/go2alyssa/Desktop/", scale = 2.6, dpi = "retina")

```

```{r}
# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_proc_wt <- pten1_proc[!is.na(pten1_proc$position),]
pten1_proc_wt$secondary_struct <- ifelse(is.na(pten1_proc_wt$helix), "unknown",
                        ifelse(pten1_proc_wt$helix==1, "helix",
                        ifelse(pten1_proc_wt$sheet==1, "sheet",
                        ifelse(pten1_proc_wt$helix==0, "neither",
                        "unknown"))))
pten_pos <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN scores in relation to protein structure") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)+theme_bw()

pten_hydro <- ggplot(pten1_proc_wt, aes(x=position, y=score, colour=(hydro2-hydro1)))+ geom_point(size=.3, alpha = 0.3) + scale_x_continuous(minor_breaks = seq(0, 420, 5)) + ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Hydrophobicity")+ggtitle("PTEN scores in relation to change in hydrophobicity") + geom_vline(xintercept=27, color="black", size=.1) + geom_vline(xintercept=55, color="black", size=.1) + geom_vline(xintercept=70, color="black", size=.1) + geom_vline(xintercept=85, color="black", size=.1) + geom_vline(xintercept=164.5, color="black", size=.1) + geom_vline(xintercept=212, color="black", size=.1) + geom_vline(xintercept=267.5, color="black", size=.1) + geom_vline(xintercept=343.5, color="black", size=.1)


#tpmt
tpmt1_proc_wt <- tpmt1_proc[!is.na(tpmt1_proc$position),]
tpmt1_proc_wt$secondary_struct <- ifelse(is.na(tpmt1_proc_wt$helix), "unknown",
                        ifelse(tpmt1_proc_wt$helix==1, "helix",
                        ifelse(tpmt1_proc_wt$sheet==1, "sheet",
                        ifelse(tpmt1_proc_wt$helix==0, "neither",
                        "unknown"))))
tpmt_pos <- ggplot(tpmt1_proc_wt, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)+theme_bw()

tpmt_colors <- tpmt1_proc_wt
#[order(position, variant),]
tpmt_colors$fact <- rep(10, nrow(tpmt_colors))
temp <- 1
for(i in 1:(length(tpmt_colors$fact)-1)) {
  if (tpmt_colors$secondary_struct[i] != tpmt_colors$secondary_struct[i+1]) {
    tpmt_colors$fact[i] <- temp
    temp <- temp + 1
  } else {
  tpmt_colors$fact[i] <- temp
  }
}
tpmt_colors$fact[length(tpmt_colors$fact)] <- temp

# cc <- 0
# for(i in 1:(length(tpmt_colors$fact)-1)) {
#   if (tpmt_colors$fact[i] != tpmt_colors$fact[i+1]) {
#     print(cc)
#     cc <- 0
#   } else {
#     cc <- cc + 1
#   }
# }

tpmt_pos_vp <- ggplot(tpmt_colors, aes(x=position, y=score))+ geom_violin(data=tpmt_colors[c(1:2783, 2798:4000),], aes(fill=as.character(fact), colour = factor(TRUE)), draw_quantiles = c(0.5), scale = "width") + 
scale_fill_manual(values=c("1" = "#A9A9A9", "2" = "#00C853", "3" = "#FF4848", "4" = "#00C853","5" = "#FF4848", "6" = "#00C853","7" = "#5757FF", "8" = "#00C853","9" = "#FF4848","10" = "#00C853","11" = "#5757FF", "12" = "#00C853","13" = "#FF4848", "14" = "#00C853", "15" = "#5757FF", "16" = "#00C853", "17" = "#5757FF", "18" = "#00C853", "19" = "#5757FF", "20" = "#00C853", "21" = "#5757FF", "22" = "#00C853", "23" = "#FF4848", "24" = "#5757FF", "25" = "#00C853", "26" = "#FF4848", "27" = "#00C853", "28" = "#5757FF", "29" = "#00C853", "30" = "#5757FF", "31" = "#00C853")) + scale_colour_manual(values = c("black")) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + ylab("VAMP-seq score")+xlab("Position in TPMT")+labs(colour="Secondary Structure")+ggtitle("TPMT scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)


pten_hydro1 <- ggplot(pten1_proc_wt, aes(y=score, x=(hydro2-hydro1)))+ geom_point(size=0.5, alpha = 0.3) + ylab("Hydrophobicity")+xlab("VAMP-seq score")+ggtitle("PTEN scores in relation to change in hydrophobicity")

pten_aa_spread <- ggplot(pten1_proc_wt[2:288,], aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5))
pten_aa_spread1 <- ggplot(pten1_proc_wt[2:288,], aes(y=score, x=start)) + geom_point(size = 0.5)


plot(pten_pos)
plot(tpmt_pos)

#plot(tpmt_pos_vp)
#plot(pten_hydro)
#plot(pten_hydro1)
#plot(pten_aa_spread)
#plot(pten_aa_spread1)
```
```{r}
# graph VAMP-seq scores relative to variant position in protein
#pten
pten1_hbond <- pten1_proc[!is.na(pten1_proc$hbond_sum),]
pten1_hbond$secondary_struct <- ifelse(is.na(pten1_hbond$helix), "unknown",
                        ifelse(pten1_hbond$helix==1, "helix",
                        ifelse(pten1_hbond$sheet==1, "sheet",
                        ifelse(pten1_hbond$helix==0, "neither",
                        "unknown"))))
pten_plot_hbond <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score, colour=secondary_struct))+ geom_point(alpha=0.4) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding") + scale_color_manual(values=c("#FF4848", "#696969", "#5757FF")) + labs(colour="Secondary Structure")
plot(pten_plot_hbond)

pten_plot_hbond1 <- ggplot(pten1_hbond, aes(x=hbond_sum, y=score))+ geom_point(alpha = 0.2) + ylab("VAMP-seq score")+xlab("DSSP Sum of hydrogen bonds")+ggtitle("PTEN scores in relation to hydrogen bonding")

# was in aes, ggplot function call ---> colour=secondary_struct
#scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) + labs(colour="Secondary Structure")+

plot(pten_plot_hbond1)
#less hydrogen bonds ~ higher abundance
```
```{r}
#TPMT
tpmt_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="position")
#head(tpmt_sum)
tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
#factor(position) is getting rid of some positions altogether on the graph
#tpmt_pos_mean <- ggplot(tpmt_sum, aes(x=factor(position), y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=0.001, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 250, 10))
tpmt_heat <- ggplot(tpmt1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_continuous(breaks = seq(0, 245, 10), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#scale_fill_gradient2(low="#3F7CB9", mid="white", high="#B21F4E", midpoint=1) 
#scale_fill_distiller(palette= "RdBu")
tpmt_dssp_schematic <- ggplot() + ggtitle("TPMT mean abundance scores") +
  geom_segment(aes(x = 1, y = 0, xend = max(tpmt1_data$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = tpmt1_data, aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(tpmt1_data, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(tpmt1_data, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())

#plots
plot(tpmt_pos_mean)
plot(tpmt_heat)
tpmt_dssp_schematic

#+ geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)

#grouping all variants in the same secondary structure together
tpmt_aa_sum <- summarySE(tpmt_colors, measurevar="score", groupvars="fact")
tpmt_aa_mean <- ggplot(tpmt_aa_sum, aes(x=fact, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 245, 10)) + xlab("each bar is a different secondary structure")
plot(tpmt_aa_mean)
```
```{r}
#PTEN
pten_sum <- summarySE(pten1_proc_wt, measurevar="score", groupvars="position")
#head(pten_sum)
pten_pos_mean <- ggplot(pten_sum, aes(x=position, y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=1, size=0.3, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + geom_vline(xintercept=185, color="black", size=.1) + geom_vline(xintercept=350, color="black", size=.1)
pten_heat <- ggplot(pten1_proc_wt, aes(position, end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.23, 0.42, 1, 1.2, 1.47)))+ scale_x_continuous(breaks = seq(0, 403, 20), expand=c(0,0)) + theme(legend.position='bottom')+xlab("Position in TPMT") + scale_y_discrete(expand = c(0,0))
#c(-0.7, 0.2, 1, 1.3, 2.03)
#c(-0.23, 0.42, 1, 1.2, 1.47)
pten_extra <- read.table(file = '~/leklab/leklab/PTEN_positional_data.tsv', sep = '\t', header = TRUE)
pten_dssp_schematic <- ggplot() + ggtitle("PTEN mean abundance scores") +
  geom_segment(aes(x = 1, y = 0, xend = max(pten_extra$position)), yend = 0, size = 1, color = "grey70") +
  geom_point(data = subset(pten_extra, !is.na(xca)), aes(x = position, y = 0), color = "black", size = 1.8) +
  geom_point(data = subset(pten_extra, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
  geom_point(data = subset(pten_extra, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
  scale_x_continuous(breaks = seq(0, 403, 20), expand = c(0,0)) +
  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())

#plots
plot(pten_pos_mean)
plot(pten_heat)
pten_dssp_schematic

```


```{r include=FALSE}
#...pointless to represent data this way ...

#dbNSFP
#setup
tpmt_merge_copy <- tpmt_merge
tpmt_merge_expand_copy <- tpmt_merge_expand
#tpmt_merge_copy$SIFT_converted_rankscore[which(tpmt_merge_copy$SIFT_score == '.')] = NA
tpmt_merge_copy$SIFT_score <- as.numeric(tpmt_merge_copy$SIFT_score)
tpmt_merge_copy$SIFT_converted_rankscore <- as.numeric(tpmt_merge_copy$SIFT_converted_rankscore)
tpmt_merge_expand_copy$Polyphen2_HVAR_score <- as.numeric(tpmt_merge_expand_copy$Polyphen2_HVAR_score)

#sift_sum
tpmt_sift_sum <- summarySE(tpmt_merge_copy, measurevar="SIFT_score", groupvars="position")
tss_plot <- ggplot(tpmt_sift_sum, aes(x=position, y=SIFT_score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=SIFT_score-sd, ymax =SIFT_score+sd), width=1, size=0.3, position=position_dodge()) +ylab("dfNSFP SIFT score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
#plot(tss_plot)

#HVAR_sum
tpmt_hvar_sum <-summarySE(tpmt_merge_expand_copy, measurevar="Polyphen2_HVAR_score", groupvars="position")
ths_plot <- ggplot(tpmt_hvar_sum, aes(x=position, y=Polyphen2_HVAR_score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=Polyphen2_HVAR_score-sd, ymax =Polyphen2_HVAR_score+sd), width=1, size=0.3, position=position_dodge()) +ylab("dfNSFP HVAR score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_continuous(breaks = seq(0, 245, 10), expand = c(0,0)) + scale_y_continuous(expand = c(0,0))
#plot(ths_plot)
```

```{r}
#method1 (basic, for visualizing in rstudio)
grid.newpage()
grid.draw(rbind(ggplotGrob(tpmt_dssp_schematic), ggplotGrob(tpmt_pos_mean), ggplotGrob(tpmt_heat), size = "last"))

grid.newpage()
grid.draw(rbind(ggplotGrob(pten_dssp_schematic), ggplotGrob(pten_pos_mean), ggplotGrob(pten_heat), size = "last"))

#method2 (use for final layout, size specification, download)
gA=ggplot_gtable(ggplot_build(tpmt_pos_mean))
gB=ggplot_gtable(ggplot_build(tpmt_heat))
gC=ggplot_gtable(ggplot_build(tpmt_dssp_schematic))
ga=ggplot_gtable(ggplot_build(pten_pos_mean))
gb=ggplot_gtable(ggplot_build(pten_heat))
gc=ggplot_gtable(ggplot_build(pten_dssp_schematic))
maxWidth = grid::unit.pmax(gA$widths, gB$widths, gC$widths, ga$widths, gb$widths, gc$widths)
gA$widths <- as.list(maxWidth)
gB$widths <- as.list(maxWidth)
gC$widths <- as.list(maxWidth)
ga$widths <- as.list(maxWidth)
gb$widths <- as.list(maxWidth)
gc$widths <- as.list(maxWidth)

grid.newpage()

#storing, with specified widths!!
pdf('pten_tpmt_mean_heat.pdf', width=8, height=6)
grid.arrange(arrangeGrob(gC,gA,gB,nrow=3,heights=c(.1,.3,.8)))
grid.arrange(arrangeGrob(gc,ga,gb,nrow=3,heights=c(.1,.3,.8)))
dev.off()
```
```{r include=FALSE}
##attempt to order by abundance...
#tpmt_sum_o <- tpmt_sum[order(tpmt_sum$score),]
#tpmt_sum_o$position <- factor(tpmt_sum_o$position, levels=tpmt_sum_o$position[order(tpmt_sum_o$score)])
#tpmt_pos_mean_o <- ggplot(tpmt_sum_o, aes(x=factor(position), y=score))+ geom_bar(position=position_dodge(), stat="identity", colour="#999999") + geom_errorbar(aes(ymin=score-sd, ymax = score+sd), width=0.001, position=position_dodge()) +ylab("VAMP-seq score")+theme(axis.title.x = element_blank(), axis.text.x = element_blank()) + scale_x_discrete(breaks = seq(0, 250, 10))
#aaa <- tpmt1_proc_wt
#aaa$means <- tpmt_sum[match(aaa$position, tpmt_sum$position),2]
#bbb <- aaa[order(aaa$mean),]
#bbb$position <- as.factor(bbb$position)
#tpmt_heat_o <- ggplot(bbb, aes(x=factor(position), y=end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_discrete(breaks = seq(0, 250, 10)) + theme(legend.position='bottom')+xlab("Position in TPMT")

#aaa$position <- factor(aaa$position, levels=unique(aaa$position)[order(aaa$means)])
#tpmt_heat_o <- ggplot(aaa, aes(x=factor(mean), y=end)) + geom_tile(aes(fill=score)) + scale_fill_gradientn(colours = c("#3F7CB9", "#FFEAF3", "#B21F4E"), values = scales::rescale(c(-0.7, 0.2, 1, 1.3, 2.03)))+ scale_x_discrete(breaks = seq(0, 250, 10)) + theme(legend.position='bottom')+xlab("Position in TPMT")
#tpmt_dssp_schematic <- ggplot() + ggtitle("TPMT mean abundance scores") +
#  geom_segment(aes(x = 1, y = 0, xend = max(tpmt1_data$position)), yend = 0, size = 1, color = "grey70") +
#  geom_point(data = tpmt1_data, aes(x = position, y = 0), color = "black", size = 1.8) +
#  geom_point(data = subset(tpmt1_data, sheet == 1), aes(x = position, y = 0), color = "pink", size = 1.5) +
#  geom_point(data = subset(tpmt1_data, helix == 1), aes(x = position, y = 0), color = "cyan", size = 1.5) +
#  scale_x_continuous(breaks = seq(0, 250, 10), expand = c(0,0)) +
#  scale_y_continuous(breaks = NULL, expand = c(0,0)) +xlab(NULL) + ylab(NULL) +
#  theme(panel.border = element_blank(), axis.text.x = element_blank(), axis.text.y = element_blank())
#tpmt_dssp_schematic
#plot(tpmt_pos_mean_o)
#plot(tpmt_heat_o)
```
```{r}
#plotting mean score vs variant changed to 
tpmt_end_sum <- summarySE(tpmt1_proc_wt, measurevar="score", groupvars="end")
tpmt_end_mean <- ggplot(tpmt_end_sum, aes(x=end, y=score)) +
  geom_bar(position=position_dodge(), stat="identity", colour="#999999") +
  geom_errorbar(aes(ymin=score-sd, ymax=score+sd), width=0.001, position=position_dodge()) +
  ylab("mean abundance") + xlab("variant amino acid")
plot(tpmt_end_mean)

```
```{r include=FALSE}
#plotting scores vs end colored by location/secondary structure
#tpmt_end_scores_b <- ggplot(data=subset(tpmt1_proc_wt, sheet==1), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, sheet==1), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, sheet==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#tpmt_end_scores_a <- ggplot(data=subset(tpmt1_proc_wt, helix==1), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, helix==1), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, helix==1), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#tpmt_end_scores_n <- ggplot(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, secondary_struct=="neither"), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#plot(tpmt_end_scores_b)
#plot(tpmt_end_scores_a)
#plot(tpmt_end_scores_n)

#tpmt_end_scores_60 <- ggplot(data=subset(tpmt1_proc_wt, position<=60), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position<=60), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position<=60), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_120 <- ggplot(data=subset(tpmt1_proc_wt, position>60 & position<=120), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>60 & position<=120), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>60 & position<=120), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_180 <- ggplot(data=subset(tpmt1_proc_wt, position>120 & position<=180), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>120 & position<=180), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>120 & position<=180), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))
#tpmt_end_scores_245 <- ggplot(data=subset(tpmt1_proc_wt, position>180 & position<=245), aes(x=end, y=score, colour=position)) +
#  geom_violin(data=subset(tpmt1_proc_wt, position>180 & position<=245), draw_quantiles=0.5, scale = "width") +
#  geom_point(data=subset(tpmt1_proc_wt, position>180 & position<=245), size=.3, alpha = 0.6, position=jitter2) + ylab("abundance") + xlab("variant aa") +
#  scale_y_continuous(limits = c(-0.8, 2.2))

#plot(tpmt_end_scores_60)
#plot(tpmt_end_scores_120)
#plot(tpmt_end_scores_180)
#plot(tpmt_end_scores_245)
```


```{r}
set.seed(153)
jitter <- position_jitter(width = 1, height = NULL)
jitter1 <-position_jitter(width = 0.08, height = NULL)
jitter2 <- position_jitter(width=0.13, height = NULL)
twenty_color = c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black')

#pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")

pten_k_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_k_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "K"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Lysine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "K"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)

pten_g_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_g_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Glycine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_g_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "G"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Glycine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "G"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral")

pten_c_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "C")) + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")
#experiment_orig
#pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter) + scale_color_manual(values=c("#D02028", "#E1A12F", "#EDD941", "#F2F08E", "#EEC898", "#BCDDAE", "#A4C33B", "#76C158", "#85782E", "#315935", "#53958B", "#A1DAE0", "#486EB6", "#E6A3B4", "#C5A0CA", "#554DA0", "#99247E", "#402059", "#82421B", "#7E807E", 'black'))
#, scale = "count"
#+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.5)
pten_c_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)
pten_c_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Cysteine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)

pten_c_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=position, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "C"), aes(group=position%%450), scale = "width") + xlab("Position in PTEN") + ggtitle("Cysteine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "C"), aes(x=position, y=score), alpha = 0.75, position=jitter1) + scale_color_distiller(palette = "Spectral")
#in in geom_violin(aes()) -> colour = hydro1


#pten_s_spread <- ggplot(pten1_proc_wt, aes(y=score, x=position)) + geom_point(data=subset(pten1_proc_wt, start== "S")) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
#pten_s_spread1_old <- ggplot(pten1_proc_wt, aes(y=score, x=start)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position))) + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")
pten_s_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_s_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)

pten_s_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Serine variant abundance scores w/ hydrophobicity change")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_distiller(palette = "Spectral")
#graphing abundance vs change in hydrophobicity
pten_s_hh_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=factor(hydro2-hydro1), colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=factor(hydro2-hydro1)), scale = "width") + xlab("Change in hydrophobicity") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=factor(hydro2-hydro1), y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_hydrodiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = (hydro2-hydro1))) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_voldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = voldiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_poldiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = polaritydiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)
#pten_s_aa_weightdiff <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = weightdiff)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "S"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Serine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "S"), aes(x=end, y=score), alpha = 0.75, position=jitter1)

#specific amino acid tests
##plot(pten_k_spread)
##plot(pten_g_spread)
##plot(pten_c_spread)

##plot(pten_s_spread)
##plot(pten_s_spread1_old)

#plot(pten_s_hh_hydrodiff) #probably not very useful... does not take into account position anymore
##plot(pten_s_aa_hydrodiff)
##plot(pten_s_aa_voldiff)
##plot(pten_s_aa_poldiff)
##plot(pten_s_aa_weightdiff)


plot(pten_c_spread1)
plot(pten_c_aa)
plot(pten_c_hydrodiff)
plot(pten_s_spread1)
plot(pten_s_aa)
plot(pten_s_hydrodiff)
plot(pten_g_spread1)
plot(pten_g_aa)
plot(pten_g_hydrodiff)
plot(pten_k_spread1)
plot(pten_k_aa)
```
```{r}
pten_a_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_a_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "A"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Alanine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "A"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)

pten_r_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_r_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "R"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Arganine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "R"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)

pten_n_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_n_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "N"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Asparagine variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "N"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)

pten_d_spread1 <- ggplot(pten1_proc_wt, aes(y=score, x=factor(position), colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=factor(position)), scale = "width") + xlab("Position in PTEN") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=factor(position), y=score), alpha = 0.85, position=jitter2) + scale_color_manual(values=twenty_color)
pten_d_aa <- ggplot(pten1_proc_wt, aes(y=score, x=end, colour = end)) + geom_violin(draw_quantiles=c(0.5), data=subset(pten1_proc_wt, start== "D"), aes(group=end), scale = "width") + xlab("Amino acid") + ggtitle("Aspartic Acid variant abundance scores")+ geom_point(data=subset(pten1_proc_wt, start== "D"), aes(x=end, y=score), alpha = 0.75, position=jitter1) + scale_color_manual(values=twenty_color)


plot(pten_a_spread1)
plot(pten_a_aa)
plot(pten_r_spread1)
plot(pten_r_aa)
plot(pten_n_spread1)
plot(pten_n_aa)
plot(pten_d_spread1)
plot(pten_d_aa)


```

```{r include=FALSE}
#on hold, creation of amino acid dataframe to plot abundance scores against
name <- c('Ala', 'Arg', 'Asn', 'Asp', 'Cys', 'Glu', 'Gln', 'Gly', 'His', 'Ile', 'Leu', 'Lys', 'Met', 'Phe', 'Pro', 'Ser', 'Thr', 'Trp', 'Tyr', 'Val')
quality <- c('Hydrophobic', 'Basic', 'Polar Neutral', 'Acidic', 'Polar Neutral', 'Acidic', 'Polar Neutral', 'Glycine', 'Basic', 'Hydrophobic', 'Hydrophobic', 'Basic', 'Hydrophobic', 'Hydrophobic', 'Hydrophobic', 'Polar Neutral', 'Polar Neutral', 'Hydrophobic', 'Hydrophobic', 'Hydrophobic')
#abundance <- get better scale
abundance <- c(0.0884, 0.057, 0.0417, 0.0539, 0.0124, 0.0624, 0.0382, 0.0703, 0.0220, 0.0595, 0.0994, 0.0527, 0.0237, 0.04, 0.0471, 0.0672, 0.0543, 0.0121, 0.03, 0.0677)
#isoelectric point <- unknown source (ncbi)
isoelectric <- c(6, 10.8, 5.4, 3, 5, 3.2, 5.7, 6, 7.6, 6, 6, 9.7, 5.7, 5.5, 6.3, 5.7, 5.6, 5.9, 5.7, 6.0)
hp_k_d <- c(1.8, -4.5, -3.5, -3.5, 2.5, -3.5, -3.5, -0.4, -3.2, 4.5, 3.8, -3.9, 1.9, 2.8, -1.6, -0.8, -0.7, -0.9, -1.3, 4.2)
hp_janin <-c(0.3, -1.4, -0.5, -0.6, 0.9, -0.7, -0.7, 0.3, -0.1, 0.7, 0.5, -1.8, 0.4, 0.5, -0.3, -0.1, -0.2, 0.3, -0.4, 0.6)
#Monera et al., J. Protein Sci (pro (-46) may be sketch)
hp_ph7 <- c(41, -14, -28, -55, 49, -31, -10, 0, 8, 99, 97, -23, 74, 100, -46, -5, 13, 97, 63, 76)
h_bonds <- c(0, 7, 5, 4, 0, 4, 5, 0, 3, 0, 0, 3, 0, 0, 0, 3, 3, 1, 3, 0)
mol_weight <-c(71, 156, 114, 115, 103, 129, 128, 57, 137, 113, 113, 128, 131, 147, 97, 87, 101, 186, 163, 99)

amino_acids.data <- data.frame(name, quality, abundance, isoelectric, hp_k_d, hp_janin, hp_ph7, h_bonds, mol_weight)

```

```{r include=FALSE}
img = readImage("/Users/go2alyssa/Desktop/density_plots.png")
display(img, method = "raster")
```
VAMP-seq abundance score density plots for PTEN (top) and TPMT (bottom) nonsense variants (blue dashed line), synonymous variants (red dashed line) and missense variants (filled, solid line). The missense variant densities are colored as gradients between the lowest 10% of abundance scores (blue), the WT abundance score (white) and abundance scores above WT (red).
```{r}
#Identifying items in tail to investigate
pten1_nonsense <- subset(pten1_proc, class == "nonsense")
tpmt1_nonsense <- subset(tpmt1_proc, class == "nonsense")
pten1_synon <- subset(pten1_proc, class == "synonymous")
tpmt1_synon <- subset(tpmt1_proc, class == "synonymous")

pten1_no_missense <- subset(pten1_proc, class == "synonymous" | class == "nonsense")

ggplot(pten1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white") 
#+ geom_density()
ggplot(pten1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white")

ggplot(pten1_proc_wt, aes(x=score)) + geom_histogram(data=subset(pten1_proc_wt,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_proc_wt,class == "missense"), fill = "green", alpha = 0.2, binwidth=.01)

ggplot(pten1_no_missense, aes(x=score)) + geom_histogram(data=subset(pten1_no_missense,class == "nonsense"), fill = "red", alpha = 0.5, binwidth=.01) + geom_histogram(data=subset(pten1_no_missense,class == "synonymous"), fill = "blue", alpha = 0.5, binwidth=.01)

ggplot(tpmt1_synon, aes(x=score)) + geom_histogram(binwidth=.01, colour="red", fill="white")
ggplot(tpmt1_nonsense, aes(x=score)) + geom_histogram(binwidth=.01, colour="blue", fill="white")
```
```{r}
#0.55
nonsense_tail <- subset(pten1_nonsense, score > 0.6)
synon_tail <- subset(pten1_synon, score < 0.6)
nonsense_tail$secondary_struct <- ifelse(is.na(nonsense_tail$helix), "unknown",
                        ifelse(nonsense_tail$helix==1, "helix",
                        ifelse(nonsense_tail$sheet==1, "sheet",
                        ifelse(nonsense_tail$helix==0, "neither",
                        "unknown"))))
synon_tail$secondary_struct <- ifelse(is.na(synon_tail$helix), "unknown",
                        ifelse(synon_tail$helix==1, "helix",
                        ifelse(synon_tail$sheet==1, "sheet",
                        ifelse(synon_tail$helix==0, "neither",
                        "unknown"))))

#data[row,column]
n_tail <- nonsense_tail[,c(1,2,7,30,127)]
s_tail <- synon_tail[,c(1,2,7,30,127)]
n_tail$bp_pos <- (n_tail$position-1)*3
s_tail$bp_pos <- (s_tail$position-1)*3

n_tail
s_tail
```
```{r}
#just in case there is a discernible pattern
s_tail_pos <- ggplot(s_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN synonymous variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)
plot(s_tail_pos)

#help visualizing NMD rules
n_tail_pos <- ggplot(n_tail, aes(x=position, y=score, colour=secondary_struct))+ geom_point(size=.3) + scale_x_continuous(minor_breaks = seq(0, 405, 5)) + scale_color_manual(values=c("#FF4848", "#00C853", "#5757FF", "#A9A9A9")) +ylab("VAMP-seq score")+xlab("Position in PTEN")+labs(colour="Secondary Structure")+ggtitle("PTEN nonsense variant tail scores in relation to protein structure") + geom_vline(xintercept=47, color="black", size=.1) + geom_vline(xintercept=78, color="black", size=.1) + geom_vline(xintercept=122.5, color="black", size=.1) + geom_vline(xintercept=140, color="black", size=.1) + geom_vline(xintercept=165, color="black", size=.1) + geom_vline(xintercept=194, color="black", size=.1) + geom_vline(xintercept=209, color="black", size=.1)
plot(n_tail_pos)
```

```{r include=FALSE}
s_tail$prob_AG_GT <- c(0, 1/6, 1/2, 0, 1/2, 1/6)
s_tail$prob_titv <- c(0, 2/3, 2/3, 0, 2/3, 1/3)
ggplot(n_tail, aes(x=position,y=score)) + geom_point() + geom_smooth(method = "lm")
ggplot(s_tail, aes(x=prob_titv,y=score)) + geom_point() + geom_smooth(method = "lm")
ggplot(s_tail, aes(y=prob_titv,x=score)) + geom_point() + geom_smooth(method = "lm")
rsq <- function (x, y) cor(x, y)^2
n_rsq <- rsq(n_tail$position, s_tail$score)
s_rsq <- rsq(s_tail$prob_titv, s_tail$score)
n_rsq
s_rsq
#no relationship...
```

```{r include=FALSE}
# pten1_proc_wt$secondary_struct <- ifelse(is.na(pten1_proc_wt$helix), "unknown",
#                         ifelse(pten1_proc_wt$helix==1, "helix",
#                         ifelse(pten1_proc_wt$sheet==1, "sheet",
#                         ifelse(pten1_proc_wt$helix==0, "neither",
#                         "unknown"))))

#start position within pten gene
# n_tail$s_pos <- ifelse((n_tail$bp_pos_cum)>e1, (
#   ifelse((n_tail$bp_pos_cum) > (e1+e2), (
#     ifelse((n_tail$bp_pos_cum) > (e1+e2+e3), (
#       ifelse((n_tail$bp_pos_cum) > (e1+e2+e3), (
#       
#       ), (n_tail$bp_pos_cum+e4_s))
#     ), (n_tail$bp_pos_cum+e3_s))
#   ), (n_tail$bp_pos_cum+e2_s))
# ), (n_tail$bp_pos_cum+e1_s))

#end position within pten gene

#within 2 amino acids of junction


# #e1_s is the first bp of the first exon
# e1_s = 89624227
# #e1_e is the last bp of the first exon, 
# e1_e = 89624305
# #e1 is length in bp
# el = 79
# e2 = 85
# e3 = 45
# e4 = 44
# e5 = 239
# e6 = 142
# e7 = 167
# e8 = 225
# e9 = 186
# e2_s = 89653782
# e2_e = 89653866
# e3_s = 89685270	
# e3_e = 89685314	
# e4_s = 89690803
# e4_e = 89690846
# e5_s = 89692770	
# e5_e = 89693008
# e6_s = 89711875	
# e6_e = 89712016	
# e7_s = 89717610	
# e7_e = 89717776	
# e8_s = 89720651	
# e8_e = 89720875	
# e9_s = 89725044
# e9_e = 89725229
```
```{r include=FALSE}

gs_ls()
tpmt_ruddle <- gs_title("TPMT_ruddle")
tpmt_read <- gs_read(ss=tpmt_ruddle, ws = "ruddle_tpmt_variants")
tpmt_ruddle_data <- as.data.frame(tpmt_read)
```
```{r}
#reversing data to fit tpmt1_data
rever <- function(df=tpmt_ruddle_data){df<-df[dim(df)[1]:1,]}
tpmt_ruddle_data_rev = rever(tpmt_ruddle_data)

#creating variant column, equiv to tpmt1_data's
tpmt_ruddle_data_rev$variant <- do.call(paste, c(tpmt_ruddle_data_rev[c(5,24,6)], sep=""))

#making both tables smaller
tpmt_essential <- tpmt_ruddle_data_rev[,c(2,3,4,5,6,17,19,24,27,28,29,30,31,32,33,34,35,76,77,78,137)]
tpmt1_proc_ess <- tpmt1_proc_wt[,c(1,2,3,5,6,7,30,32,80)]

#merging tables with variant name
tpmt_merge <- merge(tpmt1_proc_ess, tpmt_essential, by="variant")

#comparing abundance scores with various scores in dbNSFP (contains annotations of all potential non-synonymous single-nucleotide variants (nsSNVs) in the human genome)
tpmt_cor1 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT score")+ggtitle("1")
tpmt_cor1.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(SIFT_converted_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("SIFT converted rankscore")+ggtitle("1.5")
tpmt_cor5 <- ggplot(tpmt_merge, aes(x=score, y=CADD_raw_rankscore))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("CADD raw rankscore")+ggtitle("5")
tpmt_cor2 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV score")+ggtitle("2")
tpmt_cor3 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_score)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR score")+ggtitle("3")
tpmt_cor2.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HDIV_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HDIV rankscore")+ggtitle("2.5")
tpmt_cor3.5 <- ggplot(tpmt_merge, aes(x=score, y=as.numeric(Polyphen2_HVAR_rankscore)))+ geom_point(alpha = 0.2) + xlab("VAMP-seq score")+ylab("Polyphen2 HVAR rankscore")+ggtitle("3.5")

#CADD_phred not worth

#plot(tpmt_cor5)
#plot(tpmt_cor1)
#plot(tpmt_cor1.5)
plot(tpmt_cor2)
plot(tpmt_cor3)
plot(tpmt_cor2.5)
plot(tpmt_cor3.5)
```
```{r}
TPMT_abun_CADD <- ggplot(tpmt_merge, aes(x=abundance_class, y=CADD_raw_rankscore)) + geom_violin(draw_quantiles = c( 0.5))+ylab("CADD raw rankscore")+xlab("Abundance Class")
plot(TPMT_abun_CADD)

TPMT_abun_SIFT_conv <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(SIFT_converted_rankscore))) + geom_violin(draw_quantiles = c(0.5))+ylab("SIFT conv rankscore")+xlab("Abundance Class")
plot(TPMT_abun_SIFT_conv)

TPMT_abun_POLY <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HDIV_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HDIV rankscore")+xlab("Abundance Class")
plot(TPMT_abun_POLY)

TPMT_abun_POLY1 <- ggplot(tpmt_merge, aes(x=abundance_class, y=as.numeric(Polyphen2_HVAR_rankscore))) + geom_violin(draw_quantiles = c( 0.5))+ylab("Polyphen2 HVAR rankscore")+xlab("Abundance Class")
plot(TPMT_abun_POLY1)
```
```{r}

Pred_abun_SIFT <- ggplot(tpmt_merge, aes(abundance_class)) + geom_bar(aes(fill = SIFT_pred)) + ggtitle("Abundance class vs SIFT prediction of Damaging or Tolerated")
plot(Pred_abun_SIFT)

trial_sep <- tpmt_merge[c(21,23,24,26)]
tpmt_merge_expand <- separate_rows(tpmt_merge, c("Polyphen2_HDIV_score", "Polyphen2_HDIV_pred", "Polyphen2_HVAR_score", "Polyphen2_HVAR_pred"))

Pred_abun_HVAR <- ggplot(tpmt_merge_expand, aes(abundance_class)) + geom_bar(aes(fill = Polyphen2_HVAR_pred)) + ggtitle("Abundance class vs Polyphen2 HVAR predictions") + labs(subtitle = "D: Probably Damaging, P: Possibly Damaging, B: Benign")
plot(Pred_abun_HVAR)

```

```{r}
#creation of b-score text files for pymol use
pten_pymol <- summarySE(pten1_data, measurevar="score", groupvars="position", na.rm=TRUE)
#score[404] is wt
write.table(pten_pymol$score[1:403], "pten_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")

tpmt_pymol <- summarySE(tpmt1_data, measurevar="score", groupvars="position", na.rm=TRUE)
#score[404] is wt
write.table(tpmt_pymol$score[1:245], "tpmt_mean_scores_pymol.txt", sep="\n", row.names=F, col.names=F, na = "NaN")
```


